Loading packages:

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#install.packages("ggh4x")
# Install ggpointdensity only when it is missing, so that re-running or
# knitting this document does not download the package again every time.
# The package is fetched from an HTTPS CRAN mirror.
if (!requireNamespace("ggpointdensity", quietly = TRUE)) {
  install.packages("ggpointdensity", repos = "https://cloud.r-project.org")
}
## 
## The downloaded binary packages are in
##  /var/folders/0_/80b5wwrn45g63fjxqr25xmpr0000gn/T//RtmpDxxCDb/downloaded_packages
#library(ggh4x)
library(ggpointdensity)
#library(ggplot2)

Read in the data

# Load the training data; the first row of the CSV supplies the column names.
df_all <- readr::read_csv(
  file = "final_project_train.csv",
  col_names = TRUE
)
## Rows: 677 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): region, customer, outcome
## dbl (35): rowid, xb_01, xb_02, xb_03, xn_01, xn_02, xn_03, xa_01, xa_02, xa_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact column-by-column preview of the loaded data.
glimpse(df_all)
## Rows: 677
## Columns: 38
## $ rowid    <dbl> 1, 3, 4, 5, 8, 9, 11, 14, 15, 16, 17, 18, 19, 22, 24, 25, 27,…
## $ region   <chr> "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "…
## $ customer <chr> "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "…
## $ xb_01    <dbl> 4.000000, 1.000000, 2.000000, 2.520000, 2.548387, 3.071429, 3…
## $ xb_02    <dbl> 4, 1, 2, 11, 6, 6, 10, 12, 9, 10, 8, 10, 10, 8, 6, 10, 13, 10…
## $ xb_03    <dbl> 4, 1, 2, -6, -1, 1, -4, -4, -2, -4, -2, -2, -2, -4, 1, -4, -3…
## $ xn_01    <dbl> 3.0000000, 2.0000000, 2.0000000, 1.5333333, 0.8387097, 1.8571…
## $ xn_02    <dbl> 3, 2, 4, 9, 3, 8, 6, 10, 10, 4, 6, 8, 9, 5, 7, 12, 12, 6, 6, …
## $ xn_03    <dbl> 3, 2, 0, -3, -4, -2, -5, -6, -3, -5, -3, -6, -4, -3, 0, -5, -…
## $ xa_01    <dbl> 12.000000, 3.000000, 9.000000, 7.080000, 6.451613, 6.857143, …
## $ xa_02    <dbl> 12, 3, 9, 29, 17, 18, 24, 27, 20, 19, 15, 24, 24, 15, 14, 26,…
## $ xa_03    <dbl> 12, 3, 9, -7, -2, 2, -9, -5, -3, -3, -1, 1, -2, -3, 3, -4, -5…
## $ xb_04    <dbl> 1.3333333, 1.0000000, 1.0000000, 0.8950476, 1.2247312, 1.1857…
## $ xb_05    <dbl> 1.3333333, 1.0000000, 1.0000000, -2.0000000, -0.5000000, 0.00…
## $ xb_06    <dbl> 1.333333, 1.000000, 1.000000, 4.000000, 4.000000, 3.000000, 6…
## $ xb_07    <dbl> 4.000000, 1.000000, 2.000000, 1.933333, 1.967742, 1.714286, 1…
## $ xb_08    <dbl> -1.00000000, 1.00000000, 0.00000000, -0.08000000, 0.35483871,…
## $ xn_04    <dbl> 1.0000000, 2.0000000, 1.0000000, 0.5268889, 0.4688172, 0.5607…
## $ xn_05    <dbl> 1.0000000, 2.0000000, 0.0000000, -1.0000000, -1.3333333, -1.0…
## $ xn_06    <dbl> 1.0, 2.0, 2.0, 2.5, 3.0, 2.0, 4.0, 4.0, 3.0, 2.0, 2.0, 2.5, 2…
## $ xn_07    <dbl> 3.000000, 2.000000, 2.500000, 1.493333, 1.225806, 1.642857, 1…
## $ xn_08    <dbl> -1.0000000, 2.0000000, -1.0000000, -0.4400000, -0.4516129, -0…
## $ xa_04    <dbl> 6.000000, 3.000000, 6.750000, 2.425333, 3.023656, 2.685714, 2…
## $ xa_05    <dbl> 6.0000000, 3.0000000, 4.5000000, -3.5000000, -0.6666667, 0.40…
## $ xa_06    <dbl> 6.000000, 3.000000, 9.000000, 9.000000, 13.000000, 6.000000, …
## $ xa_07    <dbl> 9.000000, 3.000000, 7.500000, 4.466667, 4.612903, 4.071429, 4…
## $ xa_08    <dbl> 3.0000000, 3.0000000, 6.0000000, 0.7066667, 1.3225806, 1.3571…
## $ xw_01    <dbl> 23.00000, 17.00000, 52.50000, 64.52564, 54.75758, 58.33333, 6…
## $ xw_02    <dbl> 23, 17, 48, 0, 12, 15, 0, 0, 0, 7, 14, 0, 0, 0, 8, 8, 0, 4, 2…
## $ xw_03    <dbl> 23, 17, 57, 106, 105, 101, 107, 109, 109, 104, 109, 99, 103, …
## $ xs_01    <dbl> 0.262073307, 0.330804757, 0.239795763, 0.142106837, 0.2442957…
## $ xs_02    <dbl> 0.26207331, 0.33080476, 0.19049123, -0.73321509, -0.12204299,…
## $ xs_03    <dbl> 0.2620733, 0.3308048, 0.2891003, 0.5500723, 1.3134719, 0.6540…
## $ xs_04    <dbl> 0.5375576, 0.4286607, 0.3676937, 0.2865445, 0.2375470, 0.2594…
## $ xs_05    <dbl> 0.5375575604, 0.4286607050, 0.2485001680, 0.0000000000, 0.043…
## $ xs_06    <dbl> 0.5375576, 0.4286607, 0.4868872, 0.6357541, 0.4327004, 0.8672…
## $ response <dbl> 2.617991, 1.184632, 2.216626, 2.726715, 1.483323, 2.039279, 1…
## $ outcome  <chr> "non_event", "non_event", "event", "non_event", "non_event", …

High-level summary of the data. It is important to note that rowid, region, and customer are categorical inputs. All of the sentiment-derived features, whose names begin with x, are continuous inputs. Additionally, response is the continuous output and outcome is the categorical output.

# Per-column summary statistics.
summary(df_all)
##      rowid           region            customer             xb_01       
##  Min.   :   1.0   Length:677         Length:677         Min.   :-4.000  
##  1st Qu.: 312.0   Class :character   Class :character   1st Qu.: 2.333  
##  Median : 647.0   Mode  :character   Mode  :character   Median : 3.250  
##  Mean   : 648.2                                         Mean   : 3.377  
##  3rd Qu.: 972.0                                         3rd Qu.: 4.250  
##  Max.   :1324.0                                         Max.   :14.000  
##      xb_02            xb_03            xn_01             xn_02       
##  Min.   :-4.000   Min.   :-7.000   Min.   :-4.0000   Min.   :-4.000  
##  1st Qu.: 3.000   1st Qu.:-1.000   1st Qu.: 0.7917   1st Qu.: 2.000  
##  Median : 6.000   Median : 1.000   Median : 1.6000   Median : 4.000  
##  Mean   : 5.749   Mean   : 1.217   Mean   : 1.5581   Mean   : 3.665  
##  3rd Qu.: 8.000   3rd Qu.: 3.000   3rd Qu.: 2.4000   3rd Qu.: 6.000  
##  Max.   :15.000   Max.   :14.000   Max.   :10.0000   Max.   :13.000  
##      xn_03             xa_01            xa_02           xa_03       
##  Min.   :-7.0000   Min.   :-3.000   Min.   :-3.00   Min.   :-9.000  
##  1st Qu.:-2.0000   1st Qu.: 6.000   1st Qu.: 8.00   1st Qu.: 0.000  
##  Median :-1.0000   Median : 8.000   Median :13.00   Median : 3.000  
##  Mean   :-0.4018   Mean   : 8.073   Mean   :13.24   Mean   : 3.836  
##  3rd Qu.: 1.0000   3rd Qu.: 9.750   3rd Qu.:18.00   3rd Qu.: 7.000  
##  Max.   :10.0000   Max.   :35.000   Max.   :38.00   Max.   :35.000  
##      xb_04            xb_05             xb_06            xb_07       
##  Min.   :-2.000   Min.   :-3.0000   Min.   :-2.000   Min.   :-1.000  
##  1st Qu.: 0.850   1st Qu.:-0.3333   1st Qu.: 1.200   1st Qu.: 1.667  
##  Median : 1.138   Median : 0.4000   Median : 2.000   Median : 2.000  
##  Mean   : 1.153   Mean   : 0.4079   Mean   : 2.107   Mean   : 2.097  
##  3rd Qu.: 1.428   3rd Qu.: 1.0000   3rd Qu.: 3.000   3rd Qu.: 2.500  
##  Max.   : 5.000   Max.   : 5.0000   Max.   : 9.000   Max.   : 7.000  
##      xb_08             xn_04             xn_05             xn_06       
##  Min.   :-4.0000   Min.   :-4.0000   Min.   :-4.0000   Min.   :-4.000  
##  1st Qu.:-0.2500   1st Qu.: 0.2678   1st Qu.:-1.0000   1st Qu.: 0.800  
##  Median : 0.2051   Median : 0.6000   Median :-0.2500   Median : 1.250  
##  Mean   : 0.2124   Mean   : 0.6038   Mean   :-0.1584   Mean   : 1.479  
##  3rd Qu.: 1.0000   3rd Qu.: 1.0000   3rd Qu.: 0.5000   3rd Qu.: 2.000  
##  Max.   : 5.0000   Max.   : 5.0000   Max.   : 5.0000   Max.   : 7.000  
##      xn_07            xn_08              xa_04            xa_05       
##  Min.   :-4.000   Min.   :-4.00000   Min.   :-2.000   Min.   :-8.000  
##  1st Qu.: 1.000   1st Qu.:-1.00000   1st Qu.: 2.252   1st Qu.: 0.000  
##  Median : 1.400   Median :-0.30769   Median : 2.925   Median : 1.333  
##  Mean   : 1.406   Mean   :-0.26713   Mean   : 2.945   Mean   : 1.380  
##  3rd Qu.: 1.833   3rd Qu.: 0.03704   3rd Qu.: 3.500   3rd Qu.: 2.667  
##  Max.   : 5.000   Max.   : 5.00000   Max.   :12.000   Max.   :12.000  
##      xa_06            xa_07            xa_08            xw_01       
##  Min.   :-2.000   Min.   :-2.000   Min.   :-5.000   Min.   :  9.00  
##  1st Qu.: 3.000   1st Qu.: 3.882   1st Qu.: 0.400   1st Qu.: 44.36  
##  Median : 4.333   Median : 4.613   Median : 1.140   Median : 57.41  
##  Mean   : 5.149   Mean   : 4.699   Mean   : 1.221   Mean   : 57.02  
##  3rd Qu.: 6.500   3rd Qu.: 5.400   3rd Qu.: 2.000   3rd Qu.: 67.50  
##  Max.   :23.000   Max.   :13.000   Max.   :12.000   Max.   :108.00  
##      xw_02            xw_03            xs_01             xs_02         
##  Min.   :  0.00   Min.   :  9.00   Min.   :-0.3612   Min.   :-0.89585  
##  1st Qu.:  9.00   1st Qu.: 58.00   1st Qu.: 0.1449   1st Qu.:-0.14236  
##  Median : 24.00   Median : 93.00   Median : 0.2160   Median : 0.03546  
##  Mean   : 31.87   Mean   : 79.07   Mean   : 0.2148   Mean   : 0.02228  
##  3rd Qu.: 49.00   3rd Qu.:101.00   3rd Qu.: 0.2839   3rd Qu.: 0.19274  
##  Max.   :108.00   Max.   :113.00   Max.   : 0.7548   Max.   : 0.69105  
##      xs_03             xs_04            xs_05             xs_06       
##  Min.   :-0.3612   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.: 0.2412   1st Qu.:0.2438   1st Qu.:0.07934   1st Qu.:0.3040  
##  Median : 0.3870   Median :0.2908   Median :0.16213   Median :0.4324  
##  Mean   : 0.4241   Mean   :0.3011   Mean   :0.18863   Mean   :0.4666  
##  3rd Qu.: 0.5940   3rd Qu.:0.3429   3rd Qu.:0.26336   3rd Qu.:0.5948  
##  Max.   : 1.7907   Max.   :0.8988   Max.   :0.89883   Max.   :1.3088  
##     response         outcome         
##  Min.   : 0.5725   Length:677        
##  1st Qu.: 1.5615   Class :character  
##  Median : 2.2896   Mode  :character  
##  Mean   : 2.6756                     
##  3rd Qu.: 3.2764                     
##  Max.   :22.9219

Categorical Visualization of Counts

# Bar charts of the row counts for each categorical variable.
ggplot(data = df_all, mapping = aes(x = customer)) +
  geom_bar()

ggplot(data = df_all, mapping = aes(x = region)) +
  geom_bar()

ggplot(data = df_all, mapping = aes(x = outcome)) +
  geom_bar()

Continuous Visualizations:

The AFINN-derived features look roughly Gaussian.

# Long format of the xa_* (AFINN) features: one row per observation/feature.
df_all_pivot_xa <- df_all %>%
  select(starts_with("xa")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid"))

# Histogram of each feature, every facet on its own axes.
ggplot(data = df_all_pivot_xa, mapping = aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~name, scales = "free")

# Violin plot of every feature side by side.
ggplot(data = df_all_pivot_xa, mapping = aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Same reshaping, keeping region and customer as identifier columns.
df_graph <- df_all %>%
  select(starts_with("xa"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer"))

# Kernel density of each feature, one curve per region.
ggplot(data = df_graph, mapping = aes(x = value)) +
  geom_density(
    mapping = aes(group = interaction(region), color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free")

The Bing-derived features look roughly Gaussian.

# Long format of the xb_* (Bing) features: one row per observation/feature.
df_all_pivot_xb <- df_all %>%
  select(starts_with("xb")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid"))

# Histogram of each feature, every facet on its own axes.
ggplot(data = df_all_pivot_xb, mapping = aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~name, scales = "free")

# Violin plot of every feature side by side.
ggplot(data = df_all_pivot_xb, mapping = aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Kernel density of each feature, one curve per region.
df_all %>%
  select(starts_with("xb"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = value)) +
  geom_density(
    mapping = aes(group = interaction(region), color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free")

The NRC-derived features look roughly Gaussian.

# Long format of the xn_* (NRC) features: one row per observation/feature.
df_all_pivot_xn <- df_all %>%
  select(starts_with("xn")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid"))

# Histogram of each feature, every facet on its own axes.
ggplot(data = df_all_pivot_xn, mapping = aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~name, scales = "free")

# Violin plot of every feature side by side.
ggplot(data = df_all_pivot_xn, mapping = aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Kernel density of each feature, one curve per region.
df_all %>%
  select(starts_with("xn"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = value)) +
  geom_density(
    mapping = aes(group = interaction(region), color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free")

The word-derived feature xw_01 looks roughly Gaussian, but the other two features (xw_02 and xw_03) do not.

# Long format of the xw_* (word) features: one row per observation/feature.
df_all_pivot_xw <- df_all %>%
  select(starts_with("xw")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid"))

# Histogram of each feature (bin width 3), every facet on its own axes.
ggplot(data = df_all_pivot_xw, mapping = aes(x = value)) +
  geom_histogram(binwidth = 3) +
  facet_wrap(~name, scales = "free")

# Violin plot of every feature side by side.
ggplot(data = df_all_pivot_xw, mapping = aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Kernel density of each feature, one curve per region.
df_all %>%
  select(starts_with("xw"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = value)) +
  geom_density(
    mapping = aes(group = interaction(region), color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free")

# Long format of the xs_* (sentimentr) features: one row per
# observation/feature.
df_all_pivot_xs <- df_all %>%
  select(starts_with("xs")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid"))

# Histogram of each feature (bin width 0.03), every facet on its own axes.
ggplot(data = df_all_pivot_xs, mapping = aes(x = value)) +
  geom_histogram(binwidth = .03) +
  facet_wrap(~name, scales = "free")

# Violin plot of every feature side by side.
ggplot(data = df_all_pivot_xs, mapping = aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Kernel density of each feature, one curve per region.
df_all %>%
  select(starts_with("xs"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = value)) +
  geom_density(
    mapping = aes(group = interaction(region), color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free")

The ‘response’ variable doesn’t appear Gaussian, but if you log transform it the distribution is more recognizable.

# Histogram of the raw response using 25 bins.
ggplot(data = df_all, mapping = aes(x = response)) +
  geom_histogram(bins = 25)

# Histogram of the natural log of the response (bin width 0.1).
ggplot(data = df_all, mapping = aes(x = log(response))) +
  geom_histogram(binwidth = .1)

Conditioned Continuous Variables

Conditioned on Region and Customer

Different regions appear to contribute differently to the AFINN features. For example, for xa_01, region ZZ contributes much less than the other two regions, while the opposite holds for xa_06. Looking at the summary statistics for xa_03, region ZZ has its middle 50% (the interquartile range) in the positive range, unlike the other regions. With regard to customer, the features look similar except for xa_02, whose summary statistics vary when broken out by customer. Customer A has the largest middle 50% across the board.

# df_all %>% select(starts_with("xa"), region) %>%  rowid_to_column() %>% pivot_longer(!c("rowid"))
# Long format of the xa_* (AFINN) features, keeping region, customer and
# outcome as identifier columns.
df_graph <- df_all %>%
  select(starts_with("xa"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots of each feature split by region, one facet per feature.
ggplot(data = df_graph, mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# Boxplots of each feature split by customer, one facet per feature.
ggplot(data = df_graph, mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()

Where one region stands out for the Bing features, it is region YY; otherwise the regions are similar. Once again customer ‘A’ has a wide middle 50%, and it is the only customer whose middle 50% lies entirely in the positive range for all of the features. For xb_02, the first half of the customers have dramatically higher values than the latter half. For many of the features, the customers all look similar. Region does not seem to matter much either; only region ZZ differs slightly from some of the other regions.

# Boxplots of each xb_* (Bing) feature split by region, one facet per feature.
df_all %>%
  select(starts_with("xb"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# The same features split by customer.
df_all %>%
  select(starts_with("xb"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()

For NRC lexicon, the densities all seem similar except for xn_01, xn_04, xn_08. Those features have less contribution from region ZZ. The NRC lexicon seems to follow the same patterns as the features above. The 2nd feature has the most variability between the regions and customers compared to the other features.

# Boxplots of each xn_* (NRC) feature split by region, one facet per feature.
df_all %>%
  select(starts_with("xn"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# The same features split by customer.
df_all %>%
  select(starts_with("xn"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()

For the Word lexicon, the distributions are quite distinctive. xw_02 and xw_03 do not appear Gaussian, and region ZZ is not a strong contributor. Region ZZ has the widest middle 50% and a dramatically different median.

# Boxplots of each xw_* (word) feature split by region, one facet per feature.
df_all %>%
  select(starts_with("xw"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# The same features split by customer.
df_all %>%
  select(starts_with("xw"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()

For the sentimentr-derived features, region ZZ is a stronger contributor to the density than it is for the other feature groups. Once again region ZZ stands out for the 2nd feature and the last feature. This is worth keeping in mind later when fitting models.

# Boxplots of each xs_* (sentimentr) feature split by region, one facet per
# feature.
df_all %>%
  select(starts_with("xs"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# The same features split by customer.
df_all %>%
  select(starts_with("xs"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer")) %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()

Conditioned on Outcome

Feature 2 has the widest range that includes negative and positive values with the widest middle 50%. But as we know above, it may depend on different regions or customers. It also has the ‘largest’ outliers.

# Long format of the xa_* (AFINN) features; region, customer and outcome are
# kept as identifier columns so they can drive colour and faceting.
df_outcome <- df_all %>%
  select(starts_with("xa"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots of every feature, split by outcome.
df_outcome %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons per outcome, one facet per feature.
# after_stat() replaces the deprecated stat() helper for computed variables.
df_outcome %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_wrap(~name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per region.
df_outcome %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per customer.
df_outcome %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

For the Bing features, the 3rd one seems the most likely to have a negative or low sentiment score. Features 2 and 3 also have the largest middle 50%. However, it is important to note that the medians of all of the features are similar between event and non-event.

# Long format of the xb_* (Bing) features; region, customer and outcome are
# kept as identifier columns so they can drive colour and faceting.
df_outcome_b <- df_all %>%
  select(starts_with("xb"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots of every feature, split by outcome.
df_outcome_b %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons per outcome, one facet per feature.
# after_stat() replaces the deprecated stat() helper for computed variables.
df_outcome_b %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_wrap(~name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per region.
df_outcome_b %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per customer.
df_outcome_b %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

Once again the second feature has the widest middle 50%, with the largest outliers. The median for the non-event is well outside the middle 50% for the event on features 1, 2, and 3. The NRC lexicon also has the non_event values much higher across the board compared to the event values.

# Long format of the xn_* (NRC) features; region, customer and outcome are
# kept as identifier columns so they can drive colour and faceting.
df_outcome_n <- df_all %>%
  select(starts_with("xn"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots of every feature, split by outcome.
df_outcome_n %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons per outcome, one facet per feature.
# after_stat() replaces the deprecated stat() helper for computed variables.
df_outcome_n %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_wrap(~name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per region.
df_outcome_n %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per customer.
df_outcome_n %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

It is interesting to see that for the Word count derived features, the summary statistics of event vs non_event are about the same.

# Long format of the xw_* (word) features; region, customer and outcome are
# kept as identifier columns so they can drive colour and faceting.
df_outcome_w <- df_all %>%
  select(starts_with("xw"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots of every feature, split by outcome.
df_outcome_w %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons per outcome, one facet per feature.
# after_stat() replaces the deprecated stat() helper for computed variables.
df_outcome_w %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_wrap(~name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per region.
df_outcome_w %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per customer.
df_outcome_w %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

For the sentimentr-derived features, the 3rd feature has a larger middle 50% than the 2nd, along with some very extreme outliers on the positive side. However, the 2nd feature has larger outliers on the negative side. The medians of all of the features are also about equal for event vs non_event.

# Long format of the xs_* (sentimentr) features; region, customer and outcome
# are kept as identifier columns so they can drive colour and faceting.
df_outcome_s <- df_all %>%
  select(starts_with("xs"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots of every feature, split by outcome.
df_outcome_s %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons per outcome, one facet per feature.
# after_stat() replaces the deprecated stat() helper for computed variables.
df_outcome_s %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_wrap(~name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per region.
df_outcome_s %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons with one facet row per customer.
df_outcome_s %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2,
    bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

Correlation

# Correlation matrix of every input whose name starts with "x", drawn as the
# upper triangle with square glyphs.
df_all %>%
  select(starts_with("x")) %>%
  cor() %>%
  corrplot::corrplot(type = "upper", method = "square")

# The same plot restricted to each feature family in turn.
df_all %>%
  select(starts_with("xa")) %>%
  cor() %>%
  corrplot::corrplot(type = "upper", method = "square")

df_all %>%
  select(starts_with("xb")) %>%
  cor() %>%
  corrplot::corrplot(type = "upper", method = "square")

df_all %>%
  select(starts_with("xn")) %>%
  cor() %>%
  corrplot::corrplot(type = "upper", method = "square")

df_all %>%
  select(starts_with("xw")) %>%
  cor() %>%
  corrplot::corrplot(type = "upper", method = "square")

df_all %>%
  select(starts_with("xs")) %>%
  cor() %>%
  corrplot::corrplot(type = "upper", method = "square")

Input to Output Relationships

# Long format of the xa_* (AFINN) features with region, customer and response
# kept as identifier columns.
df_dense_a <- df_all %>%
  select(starts_with("xa"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each feature. Reuses
# df_dense_a rather than rebuilding the identical long table, and spells out
# `scales` instead of relying on partial argument matching.
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~name, scales = "free")

# Raw response against each feature with a linear fit per region. alpha is a
# constant, so it is set outside aes(); inside aes() it was treated as a data
# mapping, which added a legend entry instead of fixing the transparency.
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  #geom_density(mapping = aes(group = interaction(region), color = as.factor(region)), size = 1.2, adjust = 1.35) +
  #facet_grid( region ~ name, labeller = "label_both", scales = "free", space = "free")
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fits only, without the points.
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default smoother on the base-10 log of the response. Note that the other
# log-scale plots in this report use the natural log.
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Long format of the xb_* (Bing) features with region, customer and response
# kept as identifier columns.
df_dense_b <- df_all %>%
  select(starts_with("xb"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each feature. Reuses
# df_dense_b rather than rebuilding the identical long table, and spells out
# `scales` instead of relying on partial argument matching.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~name, scales = "free")

# Raw response against each feature with a linear fit per region. alpha is a
# constant, so it is set outside aes(); inside aes() it was treated as a data
# mapping, which added a legend entry instead of fixing the transparency.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  #geom_density(mapping = aes(group = interaction(region), color = as.factor(region)), size = 1.2, adjust = 1.35) +
  #facet_grid( region ~ name, labeller = "label_both", scales = "free", space = "free")
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default smoother on the raw response.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Linear fits only, without the points.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default smoother on the natural log of the response.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Long format of the xn_* (NRC) features with region, customer and response
# kept as identifier columns.
df_dense_n <- df_all %>%
  select(starts_with("xn"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each feature. Reuses
# df_dense_n rather than rebuilding the identical long table, and spells out
# `scales` instead of relying on partial argument matching.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~name, scales = "free")

# Raw response against each feature with a linear fit per region. alpha is a
# constant, so it is set outside aes(); inside aes() it was treated as a data
# mapping, which added a legend entry instead of fixing the transparency.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default smoother on the raw response.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Linear fits only, without the points.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default smoother on the natural log of the response.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Long format of the xw_* (word) features with region, customer and response
# kept as identifier columns.
df_dense_w <- df_all %>%
  select(starts_with("xw"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each feature. Reuses
# df_dense_w rather than rebuilding the identical long table, and spells out
# `scales` instead of relying on partial argument matching.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~name, scales = "free")

# Raw response against each feature with a linear fit per region. alpha is a
# constant, so it is set outside aes(); inside aes() it was treated as a data
# mapping, which added a legend entry instead of fixing the transparency.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default smoother on the raw response.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Linear fits only, without the points.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default smoother on the natural log of the response.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Long-format view of the xs_* inputs: one row per (observation, input) pair,
# carrying region, customer and response plus a rowid column added by
# rowid_to_column().
df_dense_s <- df_all %>%
  select(starts_with("xs"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(!c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each xs input. Reuses
# df_dense_s rather than rebuilding the identical pipeline, and spells out
# `scales` instead of relying on partial argument matching of `scale`.
# NOTE(review): log(response) assumes response > 0 -- confirm.
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~ name, scales = "free")

# Raw response vs. each xs input with a linear fit per region. alpha is a
# fixed setting, so it is passed outside aes(); inside aes() it is treated as
# a mapped variable and yields a spurious legend instead of 10% opacity.
df_dense_s %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = "lm", mapping = aes(color = region)) +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default (loess) smooth of response per region, without the raw points.
df_dense_s %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Linear fit of response per region, without the raw points.
df_dense_s %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = "lm", mapping = aes(color = region)) +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default (loess) smooth of log(response) per region.
df_dense_s %>% ggplot(mapping = aes(x = value, y = log(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Response vs. each input in df_dense_a with a linear fit per customer.
# Two fixes relative to the earlier version:
#   * alpha is a constant setting, so it is passed outside aes(); inside aes()
#     it was mapped as data and produced a legend instead of 10% opacity.
#   * points are coloured by customer so both layers agree with the single
#     colour scale titled "customer" (colouring points by region mixed region
#     and customer levels in one legend).
df_dense_a %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = customer), alpha = 0.1) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fit of response per customer, without the raw points.
df_dense_a %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default (loess) smooth of log10(response) per customer.
df_dense_a %>% ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Response vs. each input in df_dense_b with a linear fit per customer.
# Two fixes relative to the earlier version:
#   * alpha is a constant setting, so it is passed outside aes(); inside aes()
#     it was mapped as data and produced a legend instead of 10% opacity.
#   * points are coloured by customer so both layers agree with the single
#     colour scale titled "customer" (colouring points by region mixed region
#     and customer levels in one legend).
df_dense_b %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = customer), alpha = 0.1) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fit of response per customer, without the raw points.
df_dense_b %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default (loess) smooth of log10(response) per customer.
df_dense_b %>% ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Response vs. each input in df_dense_n with a linear fit per customer.
# Two fixes relative to the earlier version:
#   * alpha is a constant setting, so it is passed outside aes(); inside aes()
#     it was mapped as data and produced a legend instead of 10% opacity.
#   * points are coloured by customer so both layers agree with the single
#     colour scale titled "customer" (colouring points by region mixed region
#     and customer levels in one legend).
df_dense_n %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = customer), alpha = 0.1) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fit of response per customer, without the raw points.
df_dense_n %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default (loess) smooth of log10(response) per customer.
df_dense_n %>% ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Response vs. each input in df_dense_w with a linear fit per customer.
# Two fixes relative to the earlier version:
#   * alpha is a constant setting, so it is passed outside aes(); inside aes()
#     it was mapped as data and produced a legend instead of 10% opacity.
#   * points are coloured by customer so both layers agree with the single
#     colour scale titled "customer" (colouring points by region mixed region
#     and customer levels in one legend).
df_dense_w %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = customer), alpha = 0.1) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fit of response per customer, without the raw points.
df_dense_w %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default (loess) smooth of log10(response) per customer.
df_dense_w %>% ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Response vs. each input in df_dense_s with a linear fit per customer.
# Two fixes relative to the earlier version:
#   * alpha is a constant setting, so it is passed outside aes(); inside aes()
#     it was mapped as data and produced a legend instead of 10% opacity.
#   * points are coloured by customer so both layers agree with the single
#     colour scale titled "customer" (colouring points by region mixed region
#     and customer levels in one legend).
df_dense_s %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = customer), alpha = 0.1) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fit of response per customer, without the raw points.
df_dense_s %>% ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = "lm", mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Default (loess) smooth of log10(response) per customer.
df_dense_s %>% ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d("customer", option = "inferno") +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Continuous Input and Binary Output

# Numeric encoding of the binary outcome: 1 when outcome is "event", 0 for any
# other non-missing value (a missing outcome stays NA).
df_y <- df_all %>% mutate(y = ifelse(outcome == "event", 1, 0))

# Long-format xa_* inputs together with the binary outcome y; built once and
# shared by the two plots below instead of repeating the pipeline.
df_binary_xa <- df_y %>%
  select(starts_with("xa"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(!c("rowid", "region", "customer", "response", "y"))

# Jittered 0/1 outcome against each xa input, one row of panels per region.
# alpha is a fixed setting, so it is given to the geom rather than mapped in
# aes() (where it would create a legend instead of setting 10% opacity).
df_binary_xa %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, split and coloured by customer.
df_binary_xa %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")

# Long-format xb_* inputs together with the binary outcome y; built once and
# shared by the two plots below instead of repeating the pipeline.
df_binary_xb <- df_y %>%
  select(starts_with("xb"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(!c("rowid", "region", "customer", "response", "y"))

# Jittered 0/1 outcome against each xb input, one row of panels per region.
# alpha is a fixed setting, so it is given to the geom rather than mapped in
# aes() (where it would create a legend instead of setting 10% opacity).
df_binary_xb %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, split and coloured by customer.
df_binary_xb %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")

# Long-format xn_* inputs together with the binary outcome y; built once and
# shared by the two plots below instead of repeating the pipeline.
df_binary_xn <- df_y %>%
  select(starts_with("xn"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(!c("rowid", "region", "customer", "response", "y"))

# Jittered 0/1 outcome against each xn input, one row of panels per region.
# alpha is a fixed setting, so it is given to the geom rather than mapped in
# aes() (where it would create a legend instead of setting 10% opacity).
df_binary_xn %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, split and coloured by customer.
df_binary_xn %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")

# Long-format xw_* inputs together with the binary outcome y; built once and
# shared by the two plots below instead of repeating the pipeline.
df_binary_xw <- df_y %>%
  select(starts_with("xw"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(!c("rowid", "region", "customer", "response", "y"))

# Jittered 0/1 outcome against each xw input, one row of panels per region.
# alpha is a fixed setting, so it is given to the geom rather than mapped in
# aes() (where it would create a legend instead of setting 10% opacity).
df_binary_xw %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, split and coloured by customer.
df_binary_xw %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")

# Long-format xs_* inputs together with the binary outcome y; built once and
# shared by the two plots below instead of repeating the pipeline.
df_binary_xs <- df_y %>%
  select(starts_with("xs"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(!c("rowid", "region", "customer", "response", "y"))

# Jittered 0/1 outcome against each xs input, one row of panels per region.
# alpha is a fixed setting, so it is given to the geom rather than mapped in
# aes() (where it would create a legend instead of setting 10% opacity).
df_binary_xs %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, split and coloured by customer.
df_binary_xs %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")